#!/bin/bash 
# 
# Usage: The script generate:   
#	 coverage: how many reads fallen into exonic part 
#	 exclusion : how many reads spliced out for a certain exonic part  
#        calculate PSI( percentage splicing index) : The usage of a certain exonic part 
# Output:
#	 prefix_exon.coverage: The coverage count for each exonic part 
#	 prefix_exon.exclusion: The spliced read count for each exonic part     
#        prefix_exon.psi: The psi value for each exonic part 
#	 prefix_exon.gff: The exonic part annotation without aggregate gene feature
#
# Author: 
#	Miao kui, kui.miao@duke-nus.edu.sg , Duke-NUS Graduate Medical School Singapore (Duke-NUS)
#	Sebastian Schafer, sebastian.schaefer@nhcs.com.sg, National Heart Centre Singapore
# 
# Please cite "Schafer, S., Miao, K., Benson, C.C., Heinig, M., Cook, S.A., and Hubner, N. 2015. Alternative splicing signatures in RNA-seq data: percent spliced in (PSI). Curr. Protoc. Hum. Genet. doi: 10.1002/0471142905.hg1116s87"
#
# version:1.0 


## print out help information
function showHelp { 
cat << EOF
Usage: PSI.sh Command 
Command: 
	CountInclusion 		Counting exon inclusion coverage  
		Usage: PSI.sh CountInclusion <annotation.gtf> <alignment_file.bam> <baseName>
	    	  annotation.gtf	 The gtf annotation file generated by dexseq_prepare_annotation.py
	    	  alignment_file.bam	 alignment_file.bam file 
	    	  baseName               baseName of output file
  	  
	CountExclusion 		Counting exon exclusion 
		Usage: PSI.sh CountExclusion <annotation> <junctions_file> <baseName>
	    	  annotation             The gtf annotation generated by dexseq_prepare_annotation.py
	    	  junctions_file         bed format junctionfile generated by tophat/tophat2
	    	  baseName               baseName of oupput file
  	
	getPSI			Calculating PSI value 
		Usage: PSI.sh getPSI <readLength> <inclustion.txt> <exclusion.txt> <baseName>
                  readlength          the length of the short read, which will be used for exon count correction 
 	    	  inclusion.txt          exon inclusion counts generated by CountInclusion 
	    	  exclusion.txt          exon exclusion counts generated by CountExclusion
	    	  baseName               baseName of output file

	StartPSI 		Calculating PSI from the beginning     
	   	Usage: PSI.sh StartPSI  <annotation.gtf> <readLength> <alignment_file.bam> <junctions_file> <baseName>
    
	StartPSINoFilter 		Calculating PSI from the beginning without filtering for junctions falling into known exons      
	   	Usage: PSI.sh StartPSINoFilter  <annotation.gtf> <readLength> <alignment_file.bam> <junctions_file> <baseName>    

For details see: "Schafer, S., et al. 2015. Alternative splicing signatures in RNA-seq data: percent spliced in (PSI). Curr. Protoc. Hum. Genet."

EOF
exit 1 
} 


## Counting exon inclusion coverage for each exonic part. 
function CountInclusion {
	echo "Counting exon coverage...."
	
	## extracting the exon annotation from input annotation file, and simplify the attribute field as "geneID:exonNumer" to track the exon 
	[ -f ${prefix}_exonic_parts.gff ] || awk '{OFS="\t"}{if ($3 == "exonic_part") print  $1,$2,$3,$4,$5,$6,$7,$8,$14":"$12}' $annotation | sed 's@[";]@@g' > ${prefix}_exonic_parts.gff
	
	## counting the exon coverage  sort based on exon ID 
	coverageBed -split -abam $reads -b ${prefix}_exonic_parts.gff | awk 'BEGIN{OFS = "\t"}{ print $1,$4,$5,$5-$4+1,$9,$10 }' | sort -k 5 > ${prefix}_exonic_parts.inclusion
	
	echo "Exon coverage counting finished!"
	echo 
} 

## Filtering junction. Filter out the junction records which was not overlaped with any of the anotated exonic part. 
function junctionFilter {
        cat $junctions | sed 's/,/\t/g' | awk 'BEGIN{OFS="\t"}{print $1,$2,$2+$13,$4,$5,$6}' > ${prefix}_left.bed
        cat $junctions | sed 's/,/\t/g' | awk 'BEGIN{OFS="\t"}{print $1,$3-$14,$3,$4,$5,$6}' > ${prefix}_right.bed
        intersectBed -u -s -a ${prefix}_left.bed -b ${prefix}_exonic_parts.gff > ${prefix}_left.overlap
        intersectBed -u -s -a ${prefix}_right.bed -b ${prefix}_exonic_parts.gff > ${prefix}_right.overlap
        cat ${prefix}_left.overlap ${prefix}_right.overlap | cut -f4 | sort | uniq -c | awk '{ if($1 == 2) print $2 }' > ${prefix}_filtered_junctions.txt
        grep -F -f ${prefix}_filtered_junctions.txt $junctions > ${prefix}_filtered_junctions.bed	
        rm ${prefix}_left.bed ${prefix}_right.bed ${prefix}_left.overlap ${prefix}_right.overlap ${prefix}_filtered_junctions.txt
}	


## Counting exon exclusion 
function CountExclusion {
	echo "Counting exclusion...."
	echo "junctions file is: $junctions"
	[ -f ${prefix}_exonic_parts.gff ] || awk '{OFS="\t"}{if ($3 == "exonic_part") print  $1,$2,$3,$4,$5,$6,$7,$8,$14":"$12}' $annotation | sed 's@[";]@@g' > ${prefix}_exonic_parts.gff
	##  We create an intron list from Tophat junctions
	grep -v description $junctions | sed 's/,/\t/g' | awk '{OFS="\t"}{print $1,$2+$13,$3-$14,$4,$5,$6}' > ${prefix}_intron.bed

	##  Extract all Introns belonging to an exon and summarize read counts for each exon
	intersectBed -wao -f 1.0 -s -a ${prefix}_exonic_parts.gff -b ${prefix}_intron.bed | awk 'BEGIN{OFS = "\t"}{ $16 == 0? s[$9] += 0:s[$9] += $14 }END{ for (i in s) {print i,s[i]} }' | sort -k 1 > ${prefix}_exonic_parts.exclusion
	rm  ${prefix}_intron.bed	
	echo "Exclusion counting finished! "
	echo 
}


## Calculating the PSI value based on inclusion and exclusion number 
function CountPSI {
	echo "Calculating PSI value..."

	## checking the inclusion and exclusion input file, if the exonID are not sorted or different, then exit with status 9 
	cut -f5 $inclusion > ${prefix}_exonID1.txt 
	cut -f1 $exclusion > ${prefix}_exonID2.txt 
	diff ${prefix}_exonID1.txt ${prefix}_exonID2.txt > /dev/null  ||( echo "Unsorted exonID exit" &&  return 9 )
	rm ${prefix}_exonID1.txt ${prefix}_exonID2.txt 
	
	## Calculating PSI 
	paste  $inclusion $exclusion | awk -v "len=$readLength" -v "prefix=$prefix" 'BEGIN{OFS = "\t"; print "exon_ID",prefix"_length",prefix"_inclusion",prefix"_exclusion",prefix"_PSI" }{NIR=$6/($4+len-1);NER=$8/(len-1)}{print $5,$4,$6,$8,(NIR+NER==0)? "NA":NIR/(NIR + NER)}' > ${prefix}_exonic_parts.psi 
	echo "PSI calculating finished!"
}


function main {
	command=$1
	case $command in 
	CountInclusion)
		[ $# -lt 4 ] &&  showHelp
	 	annotation=$2
		reads=$3
		prefix=$4
		CountInclusion
		;;
	CountExclusion)
		[ $# -lt 4 ] && showHelp
		annotation=$2 
		junctions=$3
		prefix=$4
		junctionFilter 
		junctions=${prefix}_filtered_junctions.bed
		CountExclusion  
		;;
	getPSI)
		[ $# -lt 5 ] && showHelp
		readLength=$2 
		inclusion=$3
		exclusion=$4
		prefix=$5
		CountPSI 
		;; 
	StartPSI)
		[ $# -lt 6 ] && showHelp
		annotation=$2
		readLength=$3 
		reads=$4
		junctions=$5 
		prefix=$6 
		CountInclusion
		junctionFilter
		junctions=${prefix}_filtered_junctions.bed 
		CountExclusion
		inclusion=${prefix}_exonic_parts.inclusion
		exclusion=${prefix}_exonic_parts.exclusion 
		CountPSI 
		;;
	StartPSINoFilter)
		[ $# -lt 6 ] && showHelp
		annotation=$2
		readLength=$3 
		reads=$4
		junctions=$5 
		prefix=$6 
		CountInclusion
		CountExclusion
		inclusion=${prefix}_exonic_parts.inclusion
		exclusion=${prefix}_exonic_parts.exclusion 
		CountPSI 
		;;
	*)
		echo "Wrong options !"
		showHelp 	 
	esac 
}

## Running 
main $* 
